* Steps: 
** 1) PROJECTS DATA
** 2) CREATE DISTRICT-YEAR LEVEL DATA
** 3) MERGE ALL DISTRICT-LEVEL DATA
** 4) GENERATE TREATMENT VARS


*------------------------------------------------------------------------------
*1) PROJECT-LEVEL DATA
*------------------------------------------------------------------------------		
use "$data/mvcs_devengado_analysis.dta", clear

*Fewer onsite in places where sewerage was built?
preserve
collapse (sum) p_sewerage p_latrines, by(ubigeo)
pwcorr p_latrines p_sewerage, sig
/*
             | p_latr~s p_sewe~e
-------------+------------------
  p_latrines |   1.0000 
             |
             |
  p_sewerage |   0.0979   1.0000 
             |   0.0001
             |
*/
restore

*Keep only sewerage projects
keep if p_sewerage==1

*What types of sewerage projects are constructed?
*----------------------------------------------------
*Septic or treatment plant?
tab p_sewerage p_ptar if p_type_cat==1 , row //11% with a treatment plant
tab p_sewerage p_septank if p_type_cat==1  , row //0.6% with septic tank

lab var p_sewerage "Sewerage project"
lab var p_ptar "Sewerage project + treatment plant"

*Identify years of start and completion
*----------------------------------------
*Ever started between 2005-2013 (average 2 year construction)
sort ubigeo snip year
bysort ubigeo snip: gen p_sewerage_y0y=year if p_sewerage_y0[_n]==1 & p_sewerage_y0[_n-1]==0
bysort ubigeo snip: egen p_sewerage_y0y_max=max(p_sewerage_y0y)
lab var p_sewerage_y0y  "Year project started (flag)"
lab var p_sewerage_y0y_max "Year project started"

*Ever completed between 2005-2015
sort ubigeo snip year
bysort ubigeo snip: gen p_sewerage_y2y=year if p_sewerage_y2[_n]==1 & p_sewerage_y2[_n-1]==0
bysort ubigeo snip: egen p_sewerage_y2y_max=max(p_sewerage_y2y)
keep if p_sewerage_y0y_max>=2005 & p_sewerage_y0y_max<=2015
keep if year>=2005 & year<=2015
lab var p_sewerage_y2y "Year project completed (flag)"
lab var p_sewerage_y2y_max "Year project completed"


*Number of beneficiaries
*-----------------------------
sum p_popbenef, det
gen p_popbenef_rank=1 if p_popbenef<=`r(p25)'
replace p_popbenef_rank=2 if p_popbenef>`r(p25)' & p_popbenef<=`r(p50)'
replace p_popbenef_rank=3 if p_popbenef>`r(p50)' & p_popbenef<=`r(p75)'
replace p_popbenef_rank=4 if p_popbenef>`r(p75)'

lab def popbenef 1 "<p25" 2 "p25-p50" 3 "p50-p75" 4 ">p75"
lab val p_popbenef_rank popbenef

sum p_popbenef, det
gen p_popbenef_m=p_popbenef>=`r(p50)'

lab var p_popbenef "Estimated beneficiaries"
lab var p_popbenef_rank "Estimated beneficiaries (rank)"
lab var p_popbenef_m "Estimated beneficiaries (above or at median)"


*Halted projects
*----------------------
*Number of years without disbursement if < 100%
*% completion rate when halted?
*% projects not completed?
sort ubigeo snip year
bysort ubigeo snip: gen p_aband=1 if p_cum_perc[_n]==p_cum_perc[_n-1]
replace p_aband=. if p_cum_perc==0 | p_cum_perc>=1

bysort ubigeo snip: egen p_aband_tot=total(p_aband)
lab var p_aband_tot "Total years project is abandoned"

*Still halted by 2015:
gen p_aband_2015=1 if p_aband==1 & year==2015
replace p_aband_2015=. if p_cum_perc>=0.9
//2,572 out of 6,504 projects

*% disbursed mean p_cum_perc (final)
sum p_cum_perc if p_aband_2015==1

***Total money disbursed in halted projects
bysort ubigeo snip: egen p_aband_2015max=max(p_aband_2015)
sum devengado if p_aband_2015max==1
dis r(sum) 
//S/. 1,359,000,000 

*Ever halted?
gen p_aband_max=p_aband_tot>0 if p_aband_tot!=.
lab var p_aband_max "Project ever abandoned"


*Project halted - still active
sort ubigeo snip year
gen p_active=year>=p_sewerage_y0y_max & year<=p_sewerage_y2y_max 
replace p_aband=0 if p_aband==. & p_active==1

*Clonevar for collapse
clonevar p_aband_num=p_aband 
clonevar p_aband_num_max=p_aband_max

*Restrict completion rate to started sewerage projects
gen devx=1 if devengado!=0
replace p_cum_perc=. if p_cum_perc==0 //not started projects out of sample
replace p_cum_perc=1 if p_cum_perc>=1 & p_cum_perc!=. //projects capped at 100% of investment
replace p_cum_perc=. if p_sewerage!=1
drop devx

lab var p_aband    "Project halted"
lab var p_aband_2015 "Project halted in 2015"
lab var p_aband_2015max "Project halted by 2015"
lab var p_active "Project unfinished"
lab var p_aband_num "Project ever halted"

*Funding source
*----------------------
gen p_financ_tax=p_financ==3 if p_financ!=.
lab var p_financ_tax "Project financed by taxes"


save "$data/projects", replace


*------------------------------------------------------------------------------
*2) CREATE DISTRICT-YEAR LEVEL DATA
*------------------------------------------------------------------------------		

use "$data/projects", clear

*Get province ubigeo
gen province=substr(ubigeo,1,4)

*District level collapse
collapse (sum) p_sewerage_y0 p_sewerage_y1  p_sewerage_y2  p_active p_aband_num p_aband_num_max ///
(mean) costoactual p_financ_tax p_aband_max p_aband p_aband_tot (median) p_sewerage_t ///
, by(ubigeo province year)

*Rename variables
rename p_sewerage_y0 s0 
rename p_sewerage_y1 s1 
rename p_sewerage_y2 s2 
rename p_sewerage_t st 

*Label new variables
lab var costoactual "Mean investment"
lab var s0  "Cum. number of started sewerage"
lab var s1 "Num. sewerage in construction"
lab var s2 "Cum. number of completed sewerage"
lab var st "Median project duration"
lab var p_financ_tax "% financed by taxes"
lab var p_active "Number of projects active"
lab var p_aband "Share of projects halted (out of active)"
lab var p_aband_max "Share of total projects halted for at least 1 year"
lab var p_aband_tot "Average number of years projects are halted"
lab var p_aband_num "Number of projects halted each year"
lab var p_aband_num_max "Number of projects halted (ever)"

save "$data/mvcs_clean", replace


*------------------------------------------------------------------------------
*3) MERGE ALL DISTRICT-LEVEL DATA
*------------------------------------------------------------------------------		
use "$data/mvcs_clean", clear
tempfile mvcs_clean
save `mvcs_clean'

use "$data/district_setup", clear 
merge 1:1 ubigeo year using `mvcs_clean', gen(m_disproj)

*Districts never treated - no projects --> replace with 0
foreach var of varlist s0 s1 s2 {
replace `var'=0 if `var'==. & m_disproj==1
}

drop m_disproj

save "$data/analysis_district.dta", replace

erase "$data/mvcs_clean.dta"

*------------------------------------------------------------------------------
*4) GENERATE TREATMENT VARS
*------------------------------------------------------------------------------	

*Ever sewerage project?
bysort ubigeo: egen s0_max=max(s0)
bysort ubigeo: egen s1_max=max(s1)
bysort ubigeo: egen s2_max=max(s2)

lab var s0_max "Max projects started"
lab var s1_max "Max projects in construction"
lab var s2_max "Max projects completed"

*Construction ever happened?
gen treat_y0=s0_max!=0 

*Construction ever completed?
gen treat_y2=s2_max!=0

lab var treat_y0 "District ever started a project"
lab var treat_y2 "District ever completed a project"


*GENERATE IDs
*--------------------
*District ID
cap drop ubigeo_id
egen ubigeo_id = group(ubigeo) 
tsset ubigeo_id year

*Province ID
cap drop prov_id
egen prov_id = group(idprov) 

*Departamento ID
cap drop dep_id
sort departamento
gen dep=substr(ubigeo,1,2)
bysort dep: gen first= _n == 1
gen dep_id = sum(first)
drop first
//put Callao inside Lima
replace dep_id=15 if regexm(departamento, "CALLAO") & dep_id==7

lab var ubigeo_id "District ID - numeric"
lab var prov_id   "Province ID - numeric"
lab var dep_id    "Region ID - numeric"

*Drop duplicates of jurisdictional names
drop provincia nombprov province code_pp idprov ccpp
drop departamento dep nombdep iddpto
drop distrito nombdist iddist code_dd ccdd

save "$data/analysis_district.dta", replace


